import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.stats import zscore
In-class assignment: expectations / steps
Apply data cleaning to the dataset, then apply the KMeans algorithm to find patterns and the best value of K for the following features.
# Load the red-wine quality dataset and take a first look at it.
df = pd.read_csv('winequality-red.csv')
df.head()   # first five rows
df.shape    # (rows, columns)
df.info()   # prints column dtypes and non-null counts
We can see that all the columns are of a numerical data type.
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()
Here we check for null values in the dataset; there are no null values in the data.
# Work on a deep copy so later operations leave the original frame untouched.
df1 = df.copy(deep=True)
df1.head()
We make a copy of the original data so that the operations performed do not affect the original data.
# Remove the target column in place; clustering uses the features only.
df1.drop(columns='quality', inplace=True)
Here we drop the target column `quality`.
import seaborn as sns
# Pairwise scatter plots of every feature, with KDE curves on the diagonal.
sns.pairplot(data=df1, diag_kind='kde')
From the pairplot we can see the kernel density plots on the diagonal; the small bumps on these curves hint at the number of clusters.
# Same pairplot on the full frame, coloured by the `quality` target.
sns.pairplot(data=df, hue='quality', diag_kind='kde')
This pairplot includes the target variable; again, the bumps on the curves suggest the number of clusters.
# Annotated heat map of the pairwise correlations between the feature columns.
plt.figure(figsize=(10, 10))
sns.heatmap(data=df1.corr(), annot=True)
This is a correlation heat map of all the columns. The pairs (fixed acidity, citric acid) and (density, fixed acidity) have the highest positive correlation, while (citric acid, pH) and (fixed acidity, pH) have strong negative correlation.
# Standardise every column with scipy's z-score, applied column by column.
df1_scaled = df1.apply(zscore, axis=0)
df1_scaled.head()
Here we standardize the data using the z-score.
# Instantiate (but do not fit) a first KMeans estimator with three clusters.
model = KMeans(n_clusters=3)
model
We build a simple model using 3 as the number of clusters.
# Fit KMeans on the scaled features for k = 1..14 and record each fit's
# inertia (within-cluster sum of squared distances) for the elbow plot.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df1_scaled)
    labels = clusters.labels_
    centroids = clusters.cluster_centers_
    cluster_errors.append(clusters.inertia_)

# One row per k: the number of clusters and the inertia of that fit.
clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]
We fit a KMeans model for every K from 1 to 14 and record the cluster error of each fit.
The total sum of squared distances of every data point from its respective centroid is also called inertia. Let us print the inertia value for all K values. The K at which the inertia stops dropping significantly (elbow method) is the best K.
# Elbow plot: inertia against the number of clusters.
plt.figure(figsize=(12, 6))
plt.plot(clusters_df["num_clusters"], clusters_df["cluster_errors"], marker="o")
The elbow plot tells us how many clusters to use. The curve bends slightly at 6, so we take 6 as the number of clusters to build our model.
# Final model on all scaled features: 6 clusters, fixed seed for repeatability.
kmeans = KMeans(n_clusters=6, n_init=15, random_state=2345)
kmeans.fit(df1_scaled)

# Cluster centres as a raw array, then as a frame with the feature names.
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=df1_scaled.columns.tolist())
centroid_df
# Build a one-column frame holding the cluster label of every row, stored as
# a categorical variable.
df1_labels = pd.DataFrame({'labels': kmeans.labels_})
df1_labels['labels'] = df1_labels['labels'].astype('category')

# Attach the labels to the (unscaled) wine features; the join aligns on index.
snail_df1_labeled = df1.join(df1_labels)

# groupby(...).head(n) returns a plain DataFrame again. Using the frame's own
# length as n keeps every row of every cluster, whatever the dataset size.
# The deprecated `axis=0` argument (the default) is dropped.
df1_analysis = snail_df1_labeled.groupby('labels', observed=False).head(
    len(snail_df1_labeled)
)
df1_analysis

# Number of wines assigned to each cluster.
snail_df1_labeled['labels'].value_counts()
# 3D scatter of three scaled feature columns, coloured by cluster label.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection on older Matplotlib)

fig = plt.figure(figsize=(8, 6))
# Create the 3D axes through the figure so they are actually attached to it;
# constructing Axes3D(fig, ...) directly no longer adds the axes to the figure.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=20, azim=100)

# `kmeans` is already fitted on df1_scaled with a fixed random_state, so the
# labels are reused rather than refitting the same model.
labels = kmeans.labels_

# Columns 0, 1 and 3 of the scaled frame are the x, y and z coordinates.
# The builtin float replaces the removed `np.float` alias.
ax.scatter(
    df1_scaled.iloc[:, 0],
    df1_scaled.iloc[:, 1],
    df1_scaled.iloc[:, 3],
    c=labels.astype(float),
    edgecolor='k',
)

# Hide tick labels; `xaxis`/`yaxis`/`zaxis` replace the removed `w_*axis` names.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_title('3D plot of KMeans Clustering')
Use features fixed acidity and volatile acidity
# Restrict the scaled data to the columns from 'fixed_acidity' through
# 'volatile_acidity' (label-based slice, both ends inclusive).
df2 = df1_scaled.loc[:, 'fixed_acidity': 'volatile_acidity']
df2.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df2)
    labels = clusters.labels_
    centroids = clusters.cluster_centers_
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
The value of K is 3 in this case.
# Final model for the df2 feature subset: 3 clusters, fixed seed.
kmeans = KMeans(n_clusters=3, n_init=15, random_state=2345)
kmeans.fit(df2)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df2))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were clustered (index-aligned join).
snail_df_labeled = df2.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()
Use features citric acid and fixed acidity
# Scaled subset with the two features of interest.
df3 = df1_scaled[['citric_acid', 'fixed_acidity']]
df3.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df3)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")

# Final model for this subset: 3 clusters, fixed seed.
kmeans = KMeans(n_clusters=3, n_init=15, random_state=2345)
kmeans.fit(df3)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df3))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were clustered (index-aligned join).
snail_df_labeled = df3.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()
# 3. Use features residual sugar and sulphates
df.columns
df4 = df1_scaled[['residual_sugar', 'sulphates']]
df4.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df4)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
Here we take the value of K as 5.
# Final model for the residual-sugar / sulphates subset (df4).
# NOTE(review): k is set to 5 to match the value the write-up states for this
# feature pair; the code previously hard-coded 3 -- confirm against the elbow plot.
kmeans = KMeans(n_clusters=5, n_init=15, random_state=2345)
kmeans.fit(df4)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df4))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were actually clustered (df4); they
# were previously joined onto df2, a different feature subset.
snail_df_labeled = df4.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()
df.columns
# Scaled subset with the two sulfur-dioxide features.
df5 = df1_scaled[['free_sulfur_dioxide', 'total_sulfur_dioxide']]
df5.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df5)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
Here we take the value of K as 2.
# Final model for the sulfur-dioxide subset (df5). The model was previously
# fitted on df4 and the centroids labelled with df4's columns, so this section
# never clustered the features it introduces.
# NOTE(review): k is set to 2 to match the value the write-up states for this
# feature pair; the code previously hard-coded 3 -- confirm against the elbow plot.
kmeans = KMeans(n_clusters=2, n_init=15, random_state=2345)
kmeans.fit(df5)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df5))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were clustered (df5, not df2).
snail_df_labeled = df5.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()
Use features fixed acidity, citric acid and volatile acidity
df.columns
# Scaled subset with the three acidity-related features.
df6 = df1_scaled[['fixed_acidity', 'volatile_acidity', 'citric_acid']]
df6.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df6)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
Here we take the value of K as 2.
# Final model for the three acidity features (df6).
# NOTE(review): k is set to 2 to match the value the write-up states for this
# feature set; the code previously hard-coded 3 -- confirm against the elbow plot.
kmeans = KMeans(n_clusters=2, n_init=15, random_state=2345)
kmeans.fit(df6)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df6))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were clustered (df6, not df2).
snail_df_labeled = df6.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()
df.columns
# Scaled subset with density and pH.
df7 = df1_scaled[['density', 'pH']]
df7.head()

model = KMeans(n_clusters=3)
model

# Fit KMeans for k = 1..14 on this subset and record each fit's inertia.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    # Everything that depends on the current k must sit inside the loop body.
    clusters = KMeans(n_clusters=num_clusters, n_init=10)
    clusters.fit(df7)
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame(
    {"num_clusters": cluster_range, "cluster_errors": cluster_errors}
)
clusters_df[0:15]

# Elbow plot
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
Here we take the value of K as 4.
# Final model for the density / pH subset (df7).
# NOTE(review): k is set to 4 to match the value the write-up states for this
# feature pair; the code previously hard-coded 3 -- confirm against the elbow plot.
kmeans = KMeans(n_clusters=4, n_init=15, random_state=2345)
kmeans.fit(df7)
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=list(df7))
centroid_df

# One-column frame of cluster labels, stored as a categorical variable.
df_labels = pd.DataFrame({'labels': kmeans.labels_})
df_labels['labels'] = df_labels['labels'].astype('category')

# Attach the labels to the features that were clustered (df7, not df2).
snail_df_labeled = df7.join(df_labels)

# groupby(...).head(n) returns a plain DataFrame; using the frame's own length
# as n keeps every row. The deprecated `axis=0` argument is dropped.
df_analysis = snail_df_labeled.groupby('labels', observed=False).head(
    len(snail_df_labeled)
)
df_analysis

# Number of wines assigned to each cluster.
snail_df_labeled['labels'].value_counts()